Our goal will be to produce an equivalence table relating year 2010 Bay Area Census Tracts to MTC’s Transportation Analysis Zones (TAZ).
We download and read TAZ Data from MTC’s open data portal.
library(sf)
library(dplyr)
library(readr)
library(mapview)
taz1454 <- st_read("https://opendata.arcgis.com/datasets/b85ba4d43f9843128d3542260d9a2f1f_0.geojson")
taz1454 <- dplyr::select(taz1454,TAZ1454)
taz1454 <- dplyr::rename(taz1454, taz = TAZ1454)
taz1454 <- sf::st_transform(taz1454, crs=26910)
See methods for more detail.
Also, Crop water out of tracts. TAZ’s do not use water.
tracts <- dplyr::select(tracts,TRACTCE10)
tracts <- dplyr::rename(tracts,tract = TRACTCE10)
tracts <- st_transform(tracts, crs=26910)
detach("package:tigris", unload=TRUE)
knitr::kable(table(st_is_valid(tracts)))
bay_water <- st_read("https://geo.nyu.edu/download/file/stanford-mb777jk0330-geojson.json")
bay_water <- bay_water[st_is_valid(bay_water),]
bay_water <- st_transform(bay_water, crs=26910)
st_erase = function(x, y) st_difference(x, st_union(st_combine(y)))
tracts <- st_erase(tracts,bay_water)
#knitr::kable(table(st_is_valid(tracts)))
tracts$tract_area <- st_area(tracts)
See methods for more detail.
See methods for more detail.
library(sf)
intersection_df <- st_intersection(tracts,taz1454)
intersection_df$intersection_area <- as.numeric(st_area(intersection_df$geometry))
intersection_df$intersection_ratio <- as.numeric(intersection_df$intersection_area/intersection_df$tract_area)
For a more detailed review of those methods see ’reverse_engineer_tract_zone_2000_methods.Rmd"
Transportation Analysis Zones (TAZ) and Tracts do note share a single topology (e.g. road network).
For example, there are many instances in which TAZ boundaries are drawn 50 meters into a Census Tract.
This becomes clear when we look at a selection of the intersecting features on the map with very small areas relative to the length of their outer boundary.
intersection_df$length_to_area_ratio <- as.numeric(st_length(st_cast(intersection_df,'MULTILINESTRING'))/st_area(intersection_df))
q1 <- quantile(intersection_df$length_to_area_ratio, c(.25,.5))
intersection_df$definitely_a_sliver <- intersection_df$length_to_area_ratio>q1[2]
intersection_df$probably_a_sliver <- intersection_df$length_to_area_ratio>q1[1]
sliver_data <- intersection_df[intersection_df$definitely_a_sliver==TRUE,]
sliver_sample <- sliver_data[sample(nrow(sliver_data), 200),]
mapview(sliver_sample, color="red", map.types=c('Stamen.Toner.Light'))
These slivers of geometries that happen when we spatially intersect these two datasets. These slivers contain no information other than the fact that the geometries do not share a common topology. They account
We’ll flag these on the table as an attribute so we can consider throwing these relationships out later.
I played around with a few thresholds cartographically and .5 seems like a conservative cutoff for slivers of geometries that just based on drawing errors. Anything lower than .4 ends up with some slivers with houses in them.
We looked at the year 2000 equivalence table to get an understanding of rules we might use to think of equivalence.
The first rule that seems reasonable based on a more detailed analysis of the methods used in 2000 was to say that if a TAZ made up 15% of the area of a tract or more then that TAZ was part of a set of TAZ’s that were equivalent to the tract.
Lets take a look at a sample of which equivalencies would be tossed out for intersection using the .15 ratio of TAZ intersection area to Census Tract area method.
intersection_df_smpl <- intersection_df[sample(nrow(intersection_df), 100),]
valid_intersections_bool<- as.numeric(intersection_df_smpl$intersection_ratio)>.15
tract_id <- intersection_df_smpl[!valid_intersections_bool,]$tract
taz_id <- intersection_df_smpl[!valid_intersections_bool,]$taz
tracts1 <- tracts[tracts$tract %in% tract_id,]
taz1 <- taz1454[taz1454$taz %in% taz_id,]
intersection1 <- intersection_df_smpl[!valid_intersections_bool,]
mapview(tracts1, color="green", alpha=0.8, map.types=c('Stamen.Toner.Light')) +
mapview(taz1, color="blue", alpha=0.8, map.types=c('Stamen.Toner.Light')) +
mapview(intersection1, color="red", alpha=0.8, map.types=c('Stamen.Toner.Light'))
#mapshot(p1, "taz_ratio_of_tract_area_map.html", selfcontained = FALSE)
We have a number of instances in which the “Probably a Sliver” flag can be used to rule out relationships that are not substantive.
However, given that some Tracts have TAZ’s within them that are less than 15% of the area of the Tract, there were TAZ’s that were not matched to tracts based on this rule. (e.g. TAZ 1 and 3)
Therefore the 15% of total ratio didn’t capture all equivalence.
So we tried a method for assessing equivalence that attempts to make sure the sum of the areas of the TAZ’s that are “equivalent” to a Tract are equivalent to the area of the Tract.
Lets do the same and inspect which equivalencies be tossed out based on this for area equivalence definition.
library(dplyr)
intersection_df_no_sliver <- intersection_df[!intersection_df$definitely_a_sliver==TRUE,]
tracts_zones_area <- intersection_df_no_sliver %>%
group_by(tract) %>%
summarise(taz_area_by_tract = sum(intersection_area))
tracts_temp <- tracts
tracts_temp$tract <- as.integer(tracts_temp$tract)
st_geometry(tracts_temp) <- NULL
tracts_zones_area$tract <- as.integer(tracts_zones_area$tract)
st_geometry(tracts_zones_area) <- NULL
tracts_zones_area <- left_join(tracts_temp,tracts_zones_area, by="tract")
tracts_zones_area$taz_area_over_tract_area <- as.numeric(tracts_zones_area$taz_area_by_tract)/as.numeric(tracts_zones_area$tract_area)
print(summary(tracts_zones_area$taz_area_over_tract_area))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.05197 0.98524 0.99420 1.15572 0.99876 270.42225
q2 <- quantile(tracts_zones_area$taz_area_over_tract_area, c(.01,.99), na.rm=TRUE)
tract_ids <- tracts_zones_area[(tracts_zones_area$taz_area_over_tract_area < q2[1] |
tracts_zones_area$taz_area_over_tract_area > q2[2]),]$tract
taz_overlap_ids <- intersection_df_no_sliver[intersection_df_no_sliver$tract %in% tract_ids,]$taz
tracts1 <- tracts[tracts$tract %in% tract_ids,]
taz1 <- taz1454[taz1454$taz %in% taz_overlap_ids,]
intersection1 <- intersection_df_no_sliver[intersection_df_no_sliver$tract %in% tract_ids,]
mapview(tracts1, color="green", alpha=0.8, map.types=c('Stamen.Toner.Light')) +
mapview(taz1, color="blue", alpha=0.8, map.types=c('Stamen.Toner.Light')) +
mapview(intersection1, color="red", alpha=0.8, map.types=c('Stamen.Toner.Light'))
Some conclusions based on reviewing this map.
Many show evidence of a shifted coordinate system or transformation in the TAZ data that have made them hard to compare. These could be thrown out using the “probably a sliver” flag.
However, there are other kinds of instances in which we would throw out a valid intersection with the sum of constituent areas. Lets have a look at just examples of those.
tract_ids_non_topological <- c(392200,252914,441523,403502)
print(table(tract_ids_non_topological %in% tract_ids))
##
## FALSE
## 4
tract_ids <- tract_ids_non_topological
taz_overlap_ids <- intersection_df_no_sliver[intersection_df_no_sliver$tract %in% tract_ids,]$taz
tracts2 <- tracts[tracts$tract %in% tract_ids,]
taz2 <- taz1454[taz1454$taz %in% taz_overlap_ids,]
intersection2 <- intersection_df_no_sliver[intersection_df_no_sliver$tract %in% tract_ids,]
mapview(tracts2, color="green", alpha=0.8, map.types=c('Stamen.Toner.Light')) +
mapview(taz2, color="blue", alpha=0.8, map.types=c('Stamen.Toner.Light')) +
mapview(intersection2, color="red", alpha=0.8, map.types=c('Stamen.Toner.Light'))
#mapshot(p2, "sum_of_areas_map.html", selfcontained = FALSE)
Others show areas where very large TAZ’s have been drawn around tracts. So it seems like there is no consistent rule that TAZ’s are always smaller than a tract. Sometimes, they are larger.
For example, Tract 441523, which is a tiny piece of TAZ 769. Similar situation for Tract 252914.
In other cases, a tract may have been split in half by a TAZ. For example, tract 403502, north of Lake Merritt in Oakland. Its almost split down the middle by TAZ 972.
Given that some Tracts are far smaller than their TAZ’s, we can’t say that in all cases the sum of the areas of a TAZ should match the area of a tract.
One way to better understand what method we might use is to see how these particular tracts were dealt with in the year 2000 lookup.
First we have to grab year 2000 census data and intersect it with TAZ data. For this I use the same process as above, but change the year argument to the tigris package.
knitr::opts_chunk$set(echo = TRUE)
options(tigris_use_cache=TRUE)
counties=c("01","13","41","55","75","81","85","95","97")
tracts00 <- tigris::tracts("CA", counties, class="sf", year=2000)
tracts00 <- dplyr::select(tracts00,TRACTCE00)
tracts00 <- dplyr::rename(tracts00,tract = TRACTCE00)
knitr::kable(table(tracts00$tract %in% tract_ids_non_topological))
| Var1 | Freq |
|---|---|
| FALSE | 1405 |
knitr::kable(table(tracts$tract %in% tract_ids_non_topological))
| Var1 | Freq |
|---|---|
| FALSE | 1582 |
| TRUE | 4 |
Apparently the tracts manually identified in the review process as instances of exceptions to our assumptions about equivalence above are new for the year 2010 census.
Its tempting to use the year 2000 table as a basis for lookups, and then just add new intersections based on new tracts from 2000 to 2010.
One potential shortcoming of this method is that there is one known errors in the year 2000 lookup table.
Tract 500300 was mapped to two TAZ’s, one of which is not adjacent or near it. It was also not mapped to a TAZ that we would have expected it to be mapped to based on the ratio rule applied above.
So there is some degree (uknown how much) of error in that table. Nevertheless, it may be useful for comparison with whichever method we use.
It seems like the constituent areas definition has fewer false negatives than the ratio method. It may have more false positives, so we’ll throw out tract/taz intersections that are “probably a sliver”
intersection_no_s <- intersection_df[intersection_df$probably_a_sliver==FALSE,]
Again, we will hand review based on the ratio of the taz’s area to their tract.
tracts_zones_area <- intersection_no_s %>%
group_by(tract) %>%
summarise(taz_area_by_tract = sum(intersection_area))
tracts_temp <- tracts
st_geometry(tracts_temp) <- NULL
st_geometry(tracts_zones_area) <- NULL
tracts_zones_area <- left_join(tracts_temp,tracts_zones_area, by="tract")
tracts_zones_area$taz_area_over_tract_area <- as.numeric(tracts_zones_area$taz_area_by_tract)/as.numeric(tracts_zones_area$tract_area)
q2 <- quantile(tracts_zones_area$taz_area_over_tract_area, c(.05,.95), na.rm=TRUE)
tract_ids <- tracts_zones_area[(tracts_zones_area$taz_area_over_tract_area < q2[1] |
tracts_zones_area$taz_area_over_tract_area > q2[2]),]$tract
taz_overlap_ids <- intersection_no_s[intersection_no_s$tract %in% tract_ids,]$taz
tracts_w_taz_area <- left_join(tracts,tracts_zones_area, by='tract')
tracts3 <- tracts_w_taz_area[tracts_w_taz_area$tract %in% tract_ids,]
taz3 <- taz1454[taz1454$taz %in% taz_overlap_ids,]
intersection3 <- intersection_no_s[intersection_no_s$tract %in% tract_ids,]
mapview(tracts3, color="green", alpha=0.8, map.types=c('Stamen.Toner.Light')) +
mapview(taz3, color="blue", alpha=0.8, map.types=c('Stamen.Toner.Light')) +
mapview(intersection3, color="red", alpha=0.8, map.types=c('Stamen.Toner.Light'))
print(q2)
## 5% 95%
## 0.9155454 0.9979694
Finally, lets output a table in the expected sparse format.
st_geometry(intersection_no_s) <- NULL
tract_zone_2010_dense <- dplyr::select(intersection_no_s,taz,tract)
knitr::kable(head(tract_zone_2010_dense))
| taz | tract | |
|---|---|---|
| 1251 | 566 | 503115 |
| 1251.1 | 565 | 503115 |
| 1253.1 | 565 | 503116 |
| 1317 | 573 | 503207 |
| 1284 | 571 | 503208 |
| 927 | 576 | 503211 |
tract_zone_2010_dense$num <- ave(tract_zone_2010_dense[['taz']],
tract_zone_2010_dense[['tract']],
FUN = seq_along)
tract_zone_2010_dense$header_string <- 'rtaz'
tract_zone_2010_sparse <- tract_zone_2010_dense %>%
tidyr::unite("header_string",
header_string,
num) %>%
tidyr::spread(header_string, taz)
#knitr::kable(head(tract_zone_2010_sparse,n=20))
write_csv(tract_zone_2010_sparse,'Tract_zone_2010.csv')